From 06d17943f0cd4f1386e2dfbfbffa3e29bd458b10 Mon Sep 17 00:00:00 2001 From: "emellor@leeni.uk.xensource.com" Date: Thu, 2 Mar 2006 02:09:23 +0100 Subject: [PATCH] Added a basic integrity checker, and some basic ability to recover from store corruption, rather than just spewing error messages and exiting. Added a xenstore-control executable, which sends commands to xenstored. Currently, the only command is 'check', which triggers an integrity check. (The integrity check is also triggered whenever a corrupted store is detected). Signed-off-by: Ewan Mellor --- .hgignore | 1 + tools/xenstore/Makefile | 18 +++- tools/xenstore/xenstore_control.c | 28 +++++ tools/xenstore/xenstored_core.c | 169 ++++++++++++++++++++++-------- tools/xenstore/xenstored_core.h | 4 - 5 files changed, 168 insertions(+), 52 deletions(-) create mode 100644 tools/xenstore/xenstore_control.c diff --git a/.hgignore b/.hgignore index 1d6138311f..3e0b2e8974 100644 --- a/.hgignore +++ b/.hgignore @@ -166,6 +166,7 @@ ^tools/xenstore/xenstore-read$ ^tools/xenstore/xenstore-rm$ ^tools/xenstore/xenstore-write$ +^tools/xenstore/xenstore-control$ ^tools/xenstore/xenstore-ls$ ^tools/xenstore/xenstored$ ^tools/xenstore/xenstored_test$ diff --git a/tools/xenstore/Makefile b/tools/xenstore/Makefile index a6ba3ee2dc..ea50b86acc 100644 --- a/tools/xenstore/Makefile +++ b/tools/xenstore/Makefile @@ -27,7 +27,10 @@ CLIENTS := xenstore-exists xenstore-list xenstore-read xenstore-rm CLIENTS += xenstore-write CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS)) -all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-ls +all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-control xenstore-ls + +test_interleaved_transactions: test_interleaved_transactions.o + $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@ testcode: xs_test xenstored_test xs_random @@ -35,13 +38,16 @@ xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_trans $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@ $(CLIENTS): xenstore-%: xenstore_%.o libxenstore.so - $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@ + $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@ $(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c $(COMPILE.c) -DCLIENT_$(*F) -o $@ $< +xenstore-control: xenstore_control.o libxenstore.so + $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@ + xenstore-ls: xsls.o libxenstore.so - $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@ + $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@ xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o tdb.o $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@ @@ -77,7 +83,8 @@ libxenstore.so: xs.opic xs_lib.opic clean: testsuite-clean rm -f *.o *.opic *.so rm -f xenstored xs_random xs_stress xs_crashme - rm -f xs_test xenstored_test xs_tdb_dump xenstore-ls $(CLIENTS) + rm -f xs_test xenstored_test xs_tdb_dump xenstore-control xenstore-ls + rm -f $(CLIENTS) $(RM) $(PROG_DEP) print-dir: @@ -129,7 +136,7 @@ TAGS: tarball: clean cd .. && tar -c -j -v -h -f xenstore.tar.bz2 xenstore/ -install: libxenstore.so xenstored xenstore-ls $(CLIENTS) +install: all $(INSTALL_DIR) -p $(DESTDIR)/var/run/xenstored $(INSTALL_DIR) -p $(DESTDIR)/var/lib/xenstored $(INSTALL_DIR) -p $(DESTDIR)/usr/bin @@ -137,6 +144,7 @@ install: libxenstore.so xenstored xenstore-ls $(CLIENTS) $(INSTALL_DIR) -p $(DESTDIR)/usr/include $(INSTALL_PROG) xenstored $(DESTDIR)/usr/sbin $(INSTALL_PROG) $(CLIENTS) $(DESTDIR)/usr/bin + $(INSTALL_PROG) xenstore-control $(DESTDIR)/usr/bin $(INSTALL_PROG) xenstore-ls $(DESTDIR)/usr/bin $(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR) $(INSTALL_DATA) libxenstore.so $(DESTDIR)/usr/$(LIBDIR) diff --git a/tools/xenstore/xenstore_control.c b/tools/xenstore/xenstore_control.c new file mode 100644 index 0000000000..80685c59d7 --- /dev/null +++ b/tools/xenstore/xenstore_control.c @@ -0,0 +1,28 @@ +#include +#include +#include + +#include "xs.h" + + +int main(int argc, char **argv) +{ + if (argc < 2 || + strcmp(argv[1], "check")) + { + fprintf(stderr, + "Usage:\n" + "\n" + " %s check\n" + "\n", argv[0]); + return 2; + } + + struct xs_handle * xsh = xs_daemon_open(); + + xs_debug_command(xsh, argv[1], NULL, 0); + + xs_daemon_close(xsh); + + return 0; +} diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c index f9629626d9..a88d09efa7 100644 --- a/tools/xenstore/xenstored_core.c +++ b/tools/xenstore/xenstored_core.c @@ -60,6 +60,18 @@ static int reopen_log_pipe[2]; static char *tracefile = NULL; static TDB_CONTEXT *tdb_ctx; +static void corrupt(struct connection *conn, const char *fmt, ...); +static void check_store(); + +#define log(...) \ + do { \ + char *s = talloc_asprintf(NULL, __VA_ARGS__); \ + trace("%s\n", s); \ + syslog(LOG_ERR, "%s", s); \ + talloc_free(s); \ + } while (0) + + #ifdef TESTING static bool failtest = false; @@ -104,33 +116,6 @@ int test_mkdir(const char *dir, int perms) #include "xenstored_test.h" -/* FIXME: Ideally, this should never be called. Some can be eliminated. */ -/* Something is horribly wrong: shutdown immediately. */ -void __attribute__((noreturn)) corrupt(struct connection *conn, - const char *fmt, ...) -{ - va_list arglist; - char *str; - int saved_errno = errno; - - va_start(arglist, fmt); - str = talloc_vasprintf(NULL, fmt, arglist); - va_end(arglist); - - trace("xenstored corruption: connection id %i: err %s: %s", - conn ? (int)conn->id : -1, strerror(saved_errno), str); - eprintf("xenstored corruption: connection id %i: err %s: %s", - conn ? (int)conn->id : -1, strerror(saved_errno), str); -#ifdef TESTING - /* Allow them to attach debugger. */ - sleep(30); -#endif - syslog(LOG_DAEMON, - "xenstored corruption: connection id %i: err %s: %s", - conn ? (int)conn->id : -1, strerror(saved_errno), str); - _exit(2); -} - TDB_CONTEXT *tdb_context(struct connection *conn) { /* conn = NULL used in manual_node at setup. */ @@ -216,7 +201,8 @@ static void trace_io(const struct connection *conn, now = time(NULL); tm = localtime(&now); - trace("%s %p %02d:%02d:%02d %s (", prefix, conn, + trace("%s %p %p %04d%02d%02d %02d:%02d:%02d %s (", prefix, conn, + conn->transaction, tm->year + 1900, tm->mon + 1, tm->mday, tm->tm_hour, tm->tm_min, tm->tm_sec, sockmsg_string(data->hdr.msg.type)); @@ -837,8 +823,6 @@ static int destroy_node(void *_node) return 0; } -/* Be careful: create heirarchy, put entry in existing parent *last*. - * This helps fsck if we die during this. */ static struct node *create_node(struct connection *conn, const char *name, void *data, unsigned int datalen) @@ -939,8 +923,9 @@ static void delete_node(struct connection *conn, struct node *node) { unsigned int i; - /* Delete self, then delete children. If something goes wrong, - * consistency check will clean up this way. */ + /* Delete self, then delete children. If we crash, then the worst + that can happen is the children will continue to take up space, but + will otherwise be unreachable. */ delete_node_single(conn, node); /* Delete children, too. */ @@ -950,9 +935,14 @@ static void delete_node(struct connection *conn, struct node *node) child = read_node(conn, talloc_asprintf(node, "%s/%s", node->name, node->children + i)); - if (!child) - corrupt(conn, "No child '%s' found", child); - delete_node(conn, child); + if (child) { + delete_node(conn, child); + } + else { + trace("delete_node: No child '%s/%s' found!\n", + node->name, node->children + i); + /* Skip it, we've already deleted the parent. */ + } } } @@ -976,12 +966,15 @@ static bool delete_child(struct connection *conn, } } corrupt(conn, "Can't find child '%s' in %s", childname, node->name); + return false; } static int _rm(struct connection *conn, struct node *node, const char *name) { - /* Delete from parent first, then if something explodes fsck cleans. */ + /* Delete from parent first, then if we crash, the worst that can + happen is the child will continue to take up space, but will + otherwise be unreachable. */ struct node *parent = read_node(conn, get_parent(name)); if (!parent) { send_error(conn, EINVAL); @@ -1000,10 +993,11 @@ static int _rm(struct connection *conn, struct node *node, const char *name) static void internal_rm(const char *name) { - char *tname = talloc_strdup(talloc_autofree_context(), name); + char *tname = talloc_strdup(NULL, name); struct node *node = read_node(NULL, tname); if (node) _rm(NULL, node, tname); + talloc_free(tname); } @@ -1149,18 +1143,19 @@ static void process_message(struct connection *conn, struct buffered_data *in) case XS_DEBUG: if (streq(in->buffer, "print")) xprintf("debug: %s", in->buffer + get_string(in, 0)); + if (streq(in->buffer, "check")) + check_store(); #ifdef TESTING /* For testing, we allow them to set id. */ if (streq(in->buffer, "setid")) { conn->id = atoi(in->buffer + get_string(in, 0)); - send_ack(conn, XS_DEBUG); } else if (streq(in->buffer, "failtest")) { if (get_string(in, 0) < in->used) srandom(atoi(in->buffer + get_string(in, 0))); - send_ack(conn, XS_DEBUG); failtest = true; } #endif /* TESTING */ + send_ack(conn, XS_DEBUG); break; case XS_WATCH: @@ -1258,7 +1253,7 @@ static void handle_input(struct connection *conn) if (in->hdr.msg.len > PATH_MAX) { #ifndef TESTING - syslog(LOG_DAEMON, "Client tried to feed us %i", + syslog(LOG_ERR, "Client tried to feed us %i", in->hdr.msg.len); #endif goto bad_client; @@ -1425,10 +1420,16 @@ static void setup_structure(void) balloon driver will pick up stale entries. In the case of the balloon driver, this can be fatal. */ - char *tlocal = talloc_strdup(talloc_autofree_context(), - "/local"); + char *tlocal = talloc_strdup(NULL, "/local"); + + check_store(); + internal_rm("/local"); create_node(NULL, tlocal, NULL, 0); + + talloc_free(tlocal); + + check_store(); } else { tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT, @@ -1439,11 +1440,93 @@ static void setup_structure(void) manual_node("/", "tool"); manual_node("/tool", "xenstored"); manual_node("/tool/xenstored", NULL); + + check_store(); + } +} + +static char *child_name(const char *s1, const char *s2) +{ + if (strcmp(s1, "/")) { + return talloc_asprintf(NULL, "%s/%s", s1, s2); + } + else { + return talloc_asprintf(NULL, "/%s", s2); + } +} + +static void check_store_(const char *name) +{ + struct node *node = read_node(NULL, name); + + if (node) { + size_t i = 0; + + while (i < node->childlen) { + size_t childlen = strlen(node->children + i); + char * childname = child_name(node->name, + node->children + i); + struct node *childnode = read_node(NULL, childname); + + if (childnode) { + check_store_(childname); + i += childlen + 1; + } + else { + log("check_store: No child '%s' found!\n", + childname); + + memdel(node->children, i, childlen + 1, + node->childlen); + node->childlen -= childlen + 1; + write_node(NULL, node); + } + + talloc_free(childname); + } + } + else { + /* Impossible, because no database should ever be without the + root, and otherwise, we've just checked in our caller + (which made a recursive call to get here). */ + + log("check_store: No child '%s' found: impossible!", name); } +} + + +static void check_store() +{ + char * root = talloc_strdup(NULL, "/"); + log("Checking store ..."); + check_store_(root); + log("Checking store complete."); + talloc_free(root); +} - /* FIXME: Fsck */ + +/* Something is horribly wrong: check the store. */ +static void corrupt(struct connection *conn, const char *fmt, ...) +{ + va_list arglist; + char *str; + int saved_errno = errno; + + va_start(arglist, fmt); + str = talloc_vasprintf(NULL, fmt, arglist); + va_end(arglist); + + log("corruption detected by connection %i: err %s: %s", + conn ? (int)conn->id : -1, strerror(saved_errno), str); + +#ifdef TESTING + /* Allow them to attach debugger. */ + sleep(30); +#endif + check_store(); } + static void write_pidfile(const char *pidfile) { char buf[100]; diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h index da98a9211d..090f54a846 100644 --- a/tools/xenstore/xenstored_core.h +++ b/tools/xenstore/xenstored_core.h @@ -148,10 +148,6 @@ int destroy_tdb(void *_tdb); /* Replace the tdb: required for transaction code */ bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb); -/* Fail due to excessive corruption, capitalist pigdogs! */ -void __attribute__((noreturn)) corrupt(struct connection *conn, - const char *fmt, ...); - struct connection *new_connection(connwritefn_t *write, connreadfn_t *read); -- 2.30.2